In [1]:
%matplotlib inline
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gzip
import cPickle as pickle
In [2]:
filename = 'ERCOT wind data.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
ercot = pd.read_csv(fullpath, index_col=0)
In [3]:
ercot.head()
Out[3]:
In [4]:
ercot.loc[:,'Net Load (MW)'] = ercot.loc[:,'ERCOT Load, MW'] - ercot.loc[:,'Total Wind Output, MW']
In [5]:
ercot.head()
Out[5]:
In [6]:
ercot.loc[:,'Net Load Change (MW)'] = ercot.loc[:,'Net Load (MW)'].diff()
In [7]:
ercot.loc[:,'DATETIME'] = pd.to_datetime(ercot.index)
In [8]:
ercot.head()
Out[8]:
In [9]:
# load a zipped pickle file
# from http://stackoverflow.com/questions/18474791/decreasing-the-size-of-cpickle-objects
def load_zipped_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        loaded_object = pickle.load(f)
        return loaded_object
In [10]:
filename = 'EPA hourly dictionary.pgz'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
epaDict = load_zipped_pickle(fullpath)
In [11]:
epaDict.keys()
Out[11]:
In [12]:
def plant_gen_delta(df):
    """
    For every plant in the input df, calculate the change in gross load (MW)
    from the previous hour.
    input:
        df: dataframe of EPA clean air markets data
    return:
        df: concatenated dataframe of all plants with a 'Gen Change (MW)' column added
    """
    df_list = []
    for plant in df['PLANT_ID'].unique():
        temp = df.loc[df['PLANT_ID'] == plant, :]
        gen_change = temp.loc[:, 'GROSS LOAD (MW)'].values - temp.loc[:, 'GROSS LOAD (MW)'].shift(1).values
        temp.loc[:, 'Gen Change (MW)'] = gen_change
        df_list.append(temp)
    return pd.concat(df_list)
In [13]:
allEPA = pd.concat(epaDict)
In [14]:
allEPA.fillna(0, inplace=True)
allEPA.head()
Out[14]:
In [15]:
allEPA = plant_gen_delta(allEPA)
In [16]:
allEPA.reset_index(drop=True, inplace=True)
In [17]:
allEPA.tail()
Out[17]:
In [18]:
cluster_df = pd.read_csv('Cluster labels.csv')
In [19]:
merged_epa_cluster = pd.merge(allEPA, cluster_df, left_on=['PLANT_ID', 'YEAR'],
right_on=['plant_id', 'year'])
In [20]:
merged_epa_cluster.head()
Out[20]:
In [21]:
grouped_clusters = merged_epa_cluster.loc[:,['Gen Change (MW)', 'GROSS LOAD (MW)', 'DATETIME', 'cluster_id_6']].groupby(['DATETIME', 'cluster_id_6']).sum()
In [22]:
grouped_clusters.reset_index(inplace=True)
In [23]:
grouped_clusters.dtypes
Out[23]:
In [24]:
grouped_clusters_ercot = pd.merge(grouped_clusters, ercot, on='DATETIME')
In [25]:
grouped_clusters_ercot.loc[:,'year'] = grouped_clusters_ercot.loc[:,'DATETIME'].apply(lambda x: x.year)
grouped_clusters_ercot.head()
Out[25]:
In [100]:
g = sns.FacetGrid(grouped_clusters_ercot, col='year', col_wrap=3, hue='cluster_id_6')
g.map(plt.scatter, 'Net Load Change (MW)', 'Gen Change (MW)', facecolors='None')
g.set_xticklabels(rotation=30)
g.add_legend()
Out[100]:
Generally the figure below shows that each cluster tends to behave somewhat linearly (hourly change in generation vs. change in net load), but there are a couple of exceptions.
In [109]:
sns.lmplot('Net Load Change (MW)', 'Gen Change (MW)', data=filtered_data,
col='year', row='cluster_id_6', hue='cluster_id_6', robust=True, ci=None)
Out[109]:
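As a rough check on how linear each cluster's response is, the slope and R^2 of a simple least-squares fit can be computed per cluster and year. This is a sketch rather than part of the original analysis; it assumes scipy is available and uses the same columns as the plots above.
In [ ]:
# Sketch: slope and R^2 of Gen Change vs Net Load Change for each (cluster, year).
# scipy is an added dependency here; dropna guards against the NaN first-hour changes.
from scipy import stats

def cluster_linearity(df):
    rows = []
    for (cluster, year), grp in df.groupby(['cluster_id_6', 'year']):
        grp = grp.dropna(subset=['Net Load Change (MW)', 'Gen Change (MW)'])
        slope, intercept, r, p, se = stats.linregress(grp['Net Load Change (MW)'],
                                                      grp['Gen Change (MW)'])
        rows.append({'cluster_id_6': cluster, 'year': year,
                     'slope': slope, 'r_squared': r ** 2})
    return pd.DataFrame(rows)

cluster_linearity(grouped_clusters_ercot).head()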
The figure below shows how/if ramping changes as wind generation increases from 2007 to 2015. Cluster 3 is especially interesting because it contains high-efficiency NG plants with average capacity just under 1 GW. Over time it starts ramping up (and down) by larger amounts. Some other clusters show similar behavior, while others (like 0, 1, and 2) don't change nearly as much.
In [102]:
filtered_data = grouped_clusters_ercot.loc[grouped_clusters_ercot['year'].isin([2007, 2011, 2015])]
g = sns.FacetGrid(filtered_data, col='year', row='cluster_id_6')
g.map(plt.scatter, 'Total Wind Output, MW', 'Gen Change (MW)', facecolors='None')
g.set_xticklabels(rotation=30)
g.add_legend()
Out[102]:
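As a quick numeric companion to the plots above (a sketch, not in the original notebook), the spread of hourly generation changes per cluster and year gives one way to see whether ramps are getting larger over time.
In [ ]:
# Standard deviation of the hourly generation change, by cluster and year.
# A growing spread for a cluster (e.g. cluster 3) suggests larger up/down ramps.
ramp_spread = (grouped_clusters_ercot
               .groupby(['cluster_id_6', 'year'])['Gen Change (MW)']
               .std()
               .unstack('year'))
ramp_spread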
In [26]:
# Load EIA 860 from pickle
eia860fname = "EIA 860.pkl"
eia860Full = os.path.join(path, eia860fname)
eia860Dict = pickle.load(open(eia860Full, "rb"))
# Add year as a column
for k in eia860Dict.keys():
    eia860Dict[k]["Year"] = k
# Flatten dictionary, rename columns, and do inner join
merged860 = pd.concat(eia860Dict)
merged860.columns = ["plant_id", "nameplate_capacity", "year"]
merged860 = cluster_df.merge(merged860, on=["plant_id", "year"])
groupCapacity = merged860.loc[:,["cluster_id_6", "year", "nameplate_capacity"]].groupby(by=["cluster_id_6", "year"]).sum()
In [27]:
groupCapacity.head()
Out[27]:
In [29]:
groupCapacity.reset_index(inplace=True)
In [30]:
groupCapacity.head()
Out[30]:
In [31]:
# Yes, I realize this is a horrible name
grouped_clusters_ercot_capacity = pd.merge(groupCapacity, grouped_clusters_ercot, on=['cluster_id_6', 'year'])
In [32]:
grouped_clusters_ercot_capacity.head()
Out[32]:
In [33]:
grouped_clusters_ercot_capacity.describe()
Out[33]:
In [34]:
filename = 'Fuel prices.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
fuel_prices = pd.read_csv(fullpath, index_col=0)
In [35]:
fuel_prices.head()
Out[35]:
In [36]:
grouped_clusters_ercot_capacity.loc[:,'Month'] = grouped_clusters_ercot_capacity.loc[:,'DATETIME'].apply(lambda x: x.month)
# Make 'year' capitalization the same
grouped_clusters_ercot_capacity = grouped_clusters_ercot_capacity.rename(columns = {'year':'Year'})
In [51]:
X = pd.merge(grouped_clusters_ercot_capacity, fuel_prices, on=['Year', 'Month'])
In [52]:
X.head()
Out[52]:
In [53]:
X.tail()
Out[53]:
In [54]:
y = X.loc[:,['DATETIME', 'cluster_id_6', 'Gen Change (MW)']]
In [55]:
y.head()
Out[55]:
In [56]:
y.describe()
Out[56]:
In [57]:
X.drop('Gen Change (MW)', axis=1, inplace=True)
In [58]:
X_fn = 'X.csv'
y_fn = 'y.csv'
X_path = os.path.join(path, X_fn)
y_path = os.path.join(path, y_fn)
X.to_csv(X_path)
y.to_csv(y_path)
Still need the total capacity of each cluster at that hour (determined by year). Anything else?
X (for every cluster and every hour): if we have time and need to improve performance, maybe add the change in cluster generation over earlier timesteps (e.g. from two hours ago to one hour ago); a sketch of that lagged feature follows below.
y (for every cluster and every hour)
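A minimal sketch of how that lagged feature could be added, computed on the grouped data before 'Gen Change (MW)' is dropped from X; the new column name is hypothetical.
In [ ]:
# Hypothetical lagged feature: each cluster's generation change one hour earlier
# (the change from two hours ago to one hour ago). Sort by time within each
# cluster so shift(1) picks up the previous hour.
lagged = (grouped_clusters_ercot_capacity
          .sort_values(['cluster_id_6', 'DATETIME'])
          .groupby('cluster_id_6')['Gen Change (MW)']
          .shift(1))
grouped_clusters_ercot_capacity.loc[:, 'Gen Change t-1 (MW)'] = lagged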
In [ ]:
filename = 'EPA with hourly gen change.csv'
path = '../Clean Data'
fullpath = os.path.join(path, filename)
allEPA.to_csv(fullpath)
In [98]:
plants = allEPA.loc[allEPA['PLANT_ID'].isin([298, 3439]),:]
In [99]:
plants.tail()
Out[99]:
In [32]:
plants.index = plants['DATETIME']
In [38]:
test = pd.merge(ercot, plants, left_index=True, right_index=True)
In early years the generation change is almost independent of the demand change. This slowly shifts over time, so that the generation change becomes correlated with the change in net load.
In [40]:
g = sns.FacetGrid(test[test['PLANT_ID']==298], col='YEAR', col_wrap=3)
g.map(sns.regplot, 'Net Load Change (MW)', 'Gen Change (MW)')
g.set_xticklabels(rotation=30)
plt.suptitle('Coal plant 298', y=1.02, size=15)
Out[40]:
In [35]:
g = sns.FacetGrid(test[test['PLANT_ID']==3439], col='YEAR', col_wrap=3)
g.map(sns.regplot, 'Net Load Change (MW)', 'Gen Change (MW)')
g.set_xticklabels(rotation=30)
plt.suptitle('Natural gas plant 3439', y=1.02, size=15)
Out[35]:
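To put a number on that trend, the per-year correlation between the net load change and each plant's generation change can be computed from the same merged frame (a sketch, assuming the column names used in the plots above).
In [ ]:
# Pearson correlation between hourly net load change and a plant's generation
# change, by year; values closer to 1 in later years would support the claim above.
def yearly_corr(df, plant_id):
    sub = df[df['PLANT_ID'] == plant_id]
    return sub.groupby('YEAR').apply(
        lambda g: g['Net Load Change (MW)'].corr(g['Gen Change (MW)']))

yearly_corr(test, 298)  # coal plant 298; repeat with 3439 for the gas plant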
In [ ]: